import pandas as pd
import numpy as np
import os
import src.gesis.etl as etl
from src.gesis.settings.variable_settings import variable_settings
import copy


# Name of the participant-id column in the GESIS Stata files. It is used as
# the index of the wide frame and as an index level of the long frame.
PARTICIPANT_ID_LABEL = 'z000001a'
## Module-level cache slots, filled on first use and then returned as-is
## (instantiate once, call many). get_wide() / get_long() access them through
## the `etl` import -- presumably this very module; confirm the import path.
_wide = None
_value_labels_list = None
_long = None

def parse_varname(varname:str) -> "tuple[str, str]":
    """
    Split an absolute GESIS variable name into its wave label and the
    wave-relative variable name.

    GESIS uses two naming formats: recruitment waves have a three
    character wave label whose third character is numeric (i.e. 'a11'),
    regular waves have a two character label (i.e. 'ab'). See the
    documentation of GESIS variable names for details.

    Args:
        varname (str): absolute variable name, i.e.: 'aczs052a'

    Returns:
        tuple(str, str): (wave, relative varname), i.e.: 'ac','zs052a'.
        Names shorter than three characters are treated as a regular
        wave label, so 'ab' gives ('ab', '') instead of raising.
    """
    # Only look at the third character when it exists; indexing blindly
    # would raise IndexError on short names.
    is_recruitment_wave = len(varname) > 2 and varname[2].isnumeric()
    wave_length = 3 if is_recruitment_wave else 2
    return varname[:wave_length], varname[wave_length:]

def get_wave_from_varname(varname:str)->str:
    """Convenience wrapper around parse_varname() that keeps only the wave label."""
    wave_label, _relative_name = parse_varname(varname)
    return wave_label
    
    

def get_column_labels_from_substrings(
            df:pd.DataFrame, 
            substrings:list
        ) -> list:
    """Returns all column labels from df which contain any substring
    in the list substrings.

    Args:
        df (pd.DataFrame): df to get columns from
        substrings (list): list of substrings to check for. A single
            string is accepted too and is treated as one substring
            (not as a sequence of characters).

    Returns:
        list: matching column labels, in the order they appear in
        df.columns. Labels that are not strings never match.
    """
    # A bare string would otherwise be iterated character by character,
    # which matches almost every column.
    if isinstance(substrings, str):
        substrings = [substrings]
    return [
        col
        for col in df.columns
        if isinstance(col, str) and any(sub in col for sub in substrings)
    ]

def get_flat_value_list_from_dict(d:dict) -> list:
    """
    Collect the values of a dictionary into one flat list.

    Values that are lists contribute each of their elements, every
    other value contributes itself. Only one level is unpacked: a list
    nested inside a list value is kept as a list.

    Args:
        d (dict): dictionary whose values are single items or lists.

    Returns:
        list: the collected values, in dictionary order.
    """
    flattened = []
    for entry in d.values():
        if isinstance(entry, list):
            flattened.extend(entry)
        else:
            flattened.append(entry)
    return flattened
    

# "VARNAMES" entry of the project settings. Its values (single entries or
# lists) are flattened by _get_wide() into the column-name substrings that
# decide which Stata columns are kept.
varnames_dict = variable_settings["VARNAMES"]

def _get_wide(
             dir  = 'data/sensitive/GESIS/ZA5665_GESIS_Panel_v44-0-0/data/stata'
             )->"tuple[pd.DataFrame, list]":
    """
    Reads the selected .dta files from the unzipped Stata directory and
    stacks them row-wise into a single wide format dataframe.

    Only columns whose name contains one of the configured VARNAMES
    substrings (plus the participant id column) are kept. Values are
    left as stored in the file (convert_categoricals=False); the
    value -> label maps are returned separately.

    Args:
        dir (str): directory holding the GESIS .dta files.

    Returns:
        tuple: (wide, value_labels_list) where wide is a
        pandas.DataFrame indexed by participant id and
        value_labels_list holds one value-label dict per file read,
        in the same order as the files were read.

    Raises:
        ValueError: if none of the expected files is present in dir.
    """
    ## keep only columns from stata files which have these substrings
    keep_values = get_flat_value_list_from_dict(varnames_dict)
    keep_values.append(PARTICIPANT_ID_LABEL)

    ## Only these files are read; this skips the demographics file that
    ## is usually included in the stata dir.
    include_list = [
        'ZA5665_a1_v44-0-0.dta',
        'ZA5665_d1_v44-0-0.dta',
        'ZA5665_f1_v44-0-0.dta'
    ]

    # os.listdir() returns names in arbitrary order. Iterating include_list
    # instead keeps the row order and the order of value_labels_list
    # identical on every run and platform.
    available = set(os.listdir(dir))

    dfs = []
    value_labels_list = []
    for filename in include_list:
        if filename not in available:
            continue
        file_path = os.path.join(dir, filename)
        ## get value2labels map from stata file
        with pd.io.stata.StataReader(file_path) as reader:
            value_labels_list.append(reader.value_labels())
        df = pd.read_stata(file_path,convert_categoricals=False)
        keep_cols = get_column_labels_from_substrings(df,keep_values)
        dfs.append(df[keep_cols].set_index(PARTICIPANT_ID_LABEL))

    if not dfs:
        # pd.concat would raise a ValueError here too, but without saying
        # which files were expected or where.
        raise ValueError(
            f"None of the expected GESIS files {include_list} were found in {dir!r}"
        )

    wide = pd.concat(dfs,axis=0)
    return wide,value_labels_list

def get_wide(
             dir  = 'data/sensitive/GESIS/ZA5665_GESIS_Panel_v44-0-0/data/stata'
             )->"tuple[pd.DataFrame, list]":
    """
    Returns the cached wide format GESIS data, building it with
    _get_wide() on the first call.

    Args:
        dir (str): directory holding the GESIS .dta files. Only used
            when the cache is still empty; later calls return the cached
            objects regardless of the value passed here.

    Returns:
        tuple: (wide, value_labels_list) - the wide pandas.DataFrame and
        the list of value-label dicts. The same objects are handed out
        on every call, so callers should copy before modifying them.
    """
    if etl._wide is None:
        etl._wide, etl._value_labels_list = _get_wide(dir)
    return etl._wide, etl._value_labels_list
    



def get_flat_line_scores():
    """
    Computes a flat-lining score per participant and wave.

    For every wave, the standard deviation of a participant's
    (max-normalised) responses across that wave's float columns is
    taken; the result is then inverted so that larger values mean
    flatter responding.

    Returns:
        pandas.DataFrame: indexed by ('Wave', participant id) with a
        single 'value' column holding the inverted standard deviation.
    """
    wide,_ = get_wide()
    ## Only float columns are scored.
    float_columns = wide.select_dtypes(include='float').columns
    wide = wide[float_columns]
    ## Blank out negative values and scale every column by its maximum.
    ## mask() is the vectorised equivalent of the element-wise applymap
    ## (deprecated in recent pandas); NaN stays NaN because NaN < 0 is False.
    ## The maximum is taken before masking, exactly as before.
    wide = wide.mask(wide < 0).div(wide.max())
    multi_index = pd.MultiIndex.from_frame(
        pd.DataFrame(
                {
                    'Wave': wide.columns.map(etl.get_wave_from_varname), 
                    'Variable': wide.columns
                }
            )
        )
    wide.columns = multi_index
    ## Transpose so the column levels become the index, group by wave
    ## (level 0) and take the std across the variables of each wave.
    std = wide.T.groupby(level=0).std().T
    ## One row per (participant, wave); keep the participant id index.
    output = std.melt(ignore_index=False)
    output = output.reset_index().set_index(['Wave',PARTICIPANT_ID_LABEL])
    ## The std of responses is smaller for flatter response patterns.
    ## Inverting values so that larger values mean flatter lining. 
    output = output.max() - output
    return output

def _get_long():
    """
    Builds the long format dataset from the wide one.

    Values are first replaced by their value labels where a label map
    exists for a column. Then, for every entry of the VARNAMES setting,
    the matching wide columns are stacked into one column indexed by
    ('Wave', participant id). The per-variable columns are finally joined
    side by side.

    Returns:
        pandas.DataFrame: one column per VARNAMES key, indexed by
        ('Wave', participant id) and sorted by that index.
    """

    wide, value_labels_list = etl.get_wide()

    # Each stata file stores categorical values as ints, but also has a
    # int2label map unique to each variable. We get each, use the most recent
    # int2label map for each varname, then apply each varname map to 
    # each wide columns.
    varnames_2_label_maps = {}

    # dict.update: for a key present in several files, the map from the
    # file that comes last in value_labels_list wins.
    for d in value_labels_list:
        varnames_2_label_maps.update(d)
        
    ## Avoid destructive changes to original wide singleton
    wide = copy.deepcopy(wide)

    # NOTE(review): assumes the keys of the value-label dicts are column
    # names of the wide frame -- confirm against the Stata files.
    for varname in varnames_2_label_maps.keys():
        if varname in wide.columns:
            ## copy to avoid recursive mapping
            # NOTE(review): Series.map with a dict presumably yields NaN for
            # values missing from the map; those rows are dropped by the
            # dropna() further down -- confirm this is intended.
            wide.loc[:,varname] = wide.loc[:,varname].map(varnames_2_label_maps[varname]).copy(deep=True).values

    varnames_dict = variable_settings["VARNAMES"]
    long_columns = []
    for var_label in varnames_dict.keys():
        # Columns of this variable across all waves, found by substring.
        # NOTE(review): the raw settings value is passed as `substrings`;
        # verify that a bare-string value (not a list) is handled as intended.
        colnames = etl.get_column_labels_from_substrings(wide,varnames_dict[var_label])
        # Rename each column to its wave label, then stack so the wave
        # becomes an inner index level.
        # NOTE(review): the rename_axis call appears to have no effect -- the
        # stacked level is still picked up as 'level_1' on the next line; confirm.
        var_multi_indexed_series = wide[colnames].rename(columns=get_wave_from_varname).stack().rename_axis(index={1: lambda x: x[1]})
        # Name the wave level 'Wave' and the value column after the variable,
        # then index by (Wave, participant id).
        var_multi_indexed_series =  var_multi_indexed_series.reset_index().rename(columns={'level_1':'Wave',0:var_label}).set_index(['Wave',etl.PARTICIPANT_ID_LABEL]).sort_index()
        #var_multi_indexed_series = var_multi_indexed_series.replace(-22.0,np.nan).replace(-11,np.nan)
        var_multi_indexed_series = var_multi_indexed_series.dropna()
        # Several columns can share a wave label after the rename; keep only
        # the first row per (Wave, participant id).
        var_multi_indexed_series = var_multi_indexed_series.loc[~var_multi_indexed_series.index.duplicated(keep='first')]
        long_columns.append(var_multi_indexed_series)
        

    # Join the per-variable columns side by side on the shared index.
    long = pd.concat(long_columns,axis=1).sort_index()
    return long

def get_long()->pd.DataFrame:
    """
    Executes the wide to long pipeline on the first call and returns
    the cached singleton instance of the long dataset afterwards.

    Returns:
        pandas.DataFrame: GESIS data in long format. The same object is
        handed out on every call, so callers should copy before
        modifying it.
    """
    if etl._long is None:
        etl._long = _get_long()
    return etl._long


def get_demographics(
        dir = 'data/sensitive/GESIS/ZA5665_GESIS_Panel_v44-0-0/data/stata/ZA5665_demography_v44-0-0.dta',
        helper= 'data/helpers/helper_wavedates.csv'
    ):
    """
    Loads the GESIS demography file and derives the model features.

    Args:
        dir (str): path to the demography .dta file.
        helper (str): path to a CSV with the columns 'Wave' and 'Start';
            'Start' is parsed with the format "%d/%m/%Y".

    Returns:
        pandas.DataFrame: indexed by ("Wave", "Participant ID"). Besides
        the columns read from the file it contains Wave_Date, Age,
        Household_Income, Personal_Income, Sex_Female, Household_Size,
        Is_Unemployed, Missing_Employment_Status, Is_Married,
        Nonresponse_This_Wave, Highest_Level_Education, Response_Type,
        NonResponse_Next_Wave, Historic_Nonresponse_Rate and
        Invited_Waves.
    """
    labels_to_process = [
            'z000001a', 
            'wave', 
            'sex',
            'yob',
            'pinc', 
            'hhinc',
            'marstat', 
            'aapor',
            'hle',
            'cohort'
        ]
    data = pd.read_stata(
            dir, 
            columns=labels_to_process
        )
    # wave label -> start date string of that wave
    wave_date_helper = pd.read_csv(helper).set_index('Wave')['Start'].to_dict()

    ## Don't map, preserve other into values
    ## (only the two open-ended year-of-birth labels are replaced)
    yob_replacements = [
        ("<=1943",1943),
        (">=1995",1995)
    ]
    ## Income brackets are mapped to their lower bound; missing values,
    ## "I don't know" and "no income" become 0.
    ## NOTE(review): "Unter 300 €" is spelled as in the original map --
    ## presumably that is the label in the data; verify.
    pinc_map = {
            np.nan:0,
            "1300 up to 1500 €":1300,
            "1500 up to 1700 €":1500,
            "900 up to 1100 €":900,
            "1100 up to 1300 €":1100,
            "700 up to 900 €":700,
            "2000 up to 2300 €":2000,
            "500 up to 700 €":500,
            "2300 up to 2600 €":2300,
            "3200 up to 4000 €":3200,
            "2600 up to 3200 €":2600,
            "5000 € and more":5000,
            "I don't know":0,
            "1700 up to 2000 €":1700,
            "300 up to 500 €":300,
            "Unter 300 €":300, ## to avoid conflating with missing or unemployed
            "I have no income of my own":0,
            "4000 up to 5000 €":4000
        }
    hhinc_map = {
            np.nan:0,
            "2300 up to 3200 €":2300,
            "1700 up to 2300 €":1700,
            "4000 up to 5000 €":4000,
            "900 up to 1300 €":900,
            "I don't know":0,
            "3200 up to 4000 €":3200,
            "6000 € and more":6000,
            "1300 up to 1700 €":1300,
            "Under 900 €":900,
            ## This key used to carry a stray "ConnectionError" prefix, so
            ## the bracket never matched and was mapped to NaN.
            "5000 up to 6000 €":5000
    }

    ## 1 = married (living together or apart), 0 = everything else
    marstat_map = {
        "Married/ R.P. living together":1,
        "Single":0,
        "Divorced/ R.P. Annulled":0,
        "Widowed/ R.P. died":0,
        "Married/ R.P. living apart":1
    }

    ## 0 = usable response (complete or sufficient partial), 1 = nonresponse
    aapor_map = {
        "Complete": 0,
        "Nothing ever returned": 1,
        "Partial or break-off with sufficient information": 0,
        "Break-off: questionnaire too incomplete to process / break-off or partial with insufficient information": 1,
        "Post: Attempted - Addressee not known at place of address": 1,
        "Death (including Post: Deceased)": 1,
        "Explicit refusal": 1,
        "Post: Moved, left no address": 1,
        "Logged on to survey, did not complete any items": 1,
        "Email Bouncer: Postbox full": 1,
        "Email Bouncer: Delivery problem": 1,
        "Other person refusal": 1,
        "Explicit refusal with incentive": 1,
        "Blank questionnaire mailed back, implicit refusal": 1,
        "Blank questionnaire with incentive returned": 1,
        "Post: Undeliverable as addressed": 1,
        "Physically or mentally unable/incompetent": 1,
        "Explicit refusal no incentive": 1,
        "Known respondent-level refusal": 1,
        "Refusal": 1,
        "Email Bouncer: Mailbox unknown": 1,
        "Implicit refusal": 1,
        "Invitation returned undelivered (Email Bouncer)": 1,
        "Post: No Mail Receptacle": 1,
        "Respondent language problem": 1,
        "Postal box full": 1,
        "Blank questionnaire with no incentive returned": 1,
        "Returned from an unsampled person": 1
    }
    
    ## Coarse grouping of the same aapor labels into response categories
    response_type_map = {
        "Complete": "Complete",
        "Nothing ever returned": "Refusal",
        "Partial or break-off with sufficient information": "Partial",
        "Break-off: questionnaire too incomplete to process / break-off or partial with insufficient information": "Partial",
        "Post: Attempted - Addressee not known at place of address": "Not_Reached",
        "Death (including Post: Deceased)": "Death_or_Incapacity",
        "Explicit refusal": "Refusal",
        "Post: Moved, left no address": "Not_Reached",
        "Logged on to survey, did not complete any items": "Refusal",
        "Email Bouncer: Postbox full": "Not_Reached",
        "Email Bouncer: Delivery problem": "Not_Reached",
        "Other person refusal": "Not_Reached",
        "Explicit refusal with incentive": "Refusal",
        "Blank questionnaire mailed back, implicit refusal": "Refusal",
        "Blank questionnaire with incentive returned": "Refusal",
        "Post: Undeliverable as addressed": "Not_Reached",
        "Physically or mentally unable/incompetent": "Death_or_Incapacity",
        "Explicit refusal no incentive": "Refusal",
        "Known respondent-level refusal": "Refusal",
        "Refusal": "Refusal",
        "Email Bouncer: Mailbox unknown":"Not_Reached",
        "Implicit refusal": "Refusal",
        "Invitation returned undelivered (Email Bouncer)":"Not_Reached",
        "Post: No Mail Receptacle":"Not_Reached",
        "Respondent language problem":"Not_Reached",
        "Postal box full":"Not_Reached",
        "Blank questionnaire with no incentive returned": "Refusal",
        "Returned from an unsampled person":"Not_Reached"
    }

    rename_map = {
        "z000001a":"Participant ID",
        "wave":"Wave"
    }
    ## Ordinal education level 1..3
    ## NOTE(review): "Graduation after a maximum of 7 years ..." is mapped to
    ## the highest level (3) -- confirm this is intended.
    education_map = {
        "General/subject-spec. univ. entrance qualif. (e.g., Abitur, EOS)":3,
        "Intermed. school leaving certif. (e.g., Realschule, Mittl. Reife)":2,
       "Secondary general school leaving certif. (e.g., Hauptschule)":2,
        "Left school without certif.":1,
        "Entrance qualif. univ. of appl. sciences (e.g., Fachobersch.)":3,
        "Graduation after a maximum of 7 years of school attendance (abroad)":3,
        "Student":1,
        "Other school leaving certif.":2
    }


    data['Wave_Date'] = pd.to_datetime(data['wave'].map(wave_date_helper),format="%d/%m/%Y")

    for yob_label, yob_value in yob_replacements:
        data['yob'] = data['yob'].replace(yob_label,yob_value)
        
    ## Age in the year of the wave; rows where it cannot be computed get 0
    data['Age'] = (data['Wave_Date'].dt.year - data['yob'].astype(float)).fillna(0)

    data['Household_Income'] = data['hhinc'].map(hhinc_map)
    data['Personal_Income'] = data['pinc'].map(pinc_map)
    data['Sex_Female'] = data['sex'] == 'Female'
    data['Household_Size'] = 0 ## Don't have it
    data['Is_Unemployed'] = 0 ## Don't have it
    data['Missing_Employment_Status'] = 0 ## Don't have it
    data['Is_Married'] = data['marstat'].map(marstat_map)
    data['Nonresponse_This_Wave'] = data['aapor'].map(aapor_map)
    data['Highest_Level_Education'] = data['hle'].map(education_map)
    data['Response_Type'] = data['aapor'].map(response_type_map)


    data = data.rename(columns=rename_map)
    data = data.set_index(["Wave","Participant ID"])

    ## Per participant (index level 1): the next row's nonresponse flag,
    ## the running mean and the running count of the flag.
    ## NOTE(review): relies on the rows of each participant already being in
    ## wave order in the file -- nothing here sorts them; confirm.
    data['NonResponse_Next_Wave'] = data['Nonresponse_This_Wave'].groupby(level=1).transform(lambda x: x.shift(-1))
    data['Historic_Nonresponse_Rate'] = data['Nonresponse_This_Wave'].groupby(level=1).transform(lambda x: x.expanding().mean()).fillna(0)

    data['Invited_Waves'] = data['Nonresponse_This_Wave'].groupby(level=1).transform(lambda x: x.expanding().count()).fillna(0)
    return data